In this notebook, we perform a clustering analysis of the CD7 data on features extracted from SCIP.
%load_ext autoreload
%autoreload 2
from scip_workflows.common import *
import warnings
import anndata
import flowutils
import scanpy
import scipy.stats
from kneed import KneeLocator
from scipy.cluster import hierarchy
from scipy.cluster.hierarchy import fcluster
from scipy.spatial.distance import squareform
from sklearn.feature_selection import VarianceThreshold, mutual_info_classif
from sklearn.preprocessing import scale
scanpy.settings.verbosity = 3
from scip.features import intensity

# Intensity feature names from SCIP, minus the higher-order moment statistics.
props = intensity.props.copy()
for unwanted in ("kurtosis", "skewness"):
    props.remove(unwanted)
def asinh_scale(x, t):
    """Asinh-transform ``x`` (flowutils, m=4.5, a=1, top-of-scale ``t``) and mean-center it."""
    transformed = flowutils.transforms.asinh(x, channel_indices=None, t=t, m=4.5, a=1)
    return scale(transformed, with_std=False)
plt.rcParams["figure.dpi"] = 150

# Resolve inputs/outputs: use the Snakemake-injected `snakemake` object when this
# runs inside the workflow; fall back to hard-coded local paths (NameError branch)
# so the notebook can also be executed interactively.
try:
    features = snakemake.input.features
    index = snakemake.input.index
    columns = snakemake.input.columns
    fillna = bool(int(snakemake.wildcards.fillna))
    output_adata = snakemake.output.adata
    output_intensity_distribution = snakemake.output.intensity_distribution
except NameError:
    # data_dir = Path("/data/gent/vo/000/gvo00070/vsc42015/datasets/cd7/800/results/scip/202203221745/")
    data_dir = Path("/home/maximl/scratch/data/vsc/datasets/cd7/800/scip/061020221736/")
    features = data_dir / "features.parquet"
    index = data_dir / "indices" / "index.npy"
    columns = data_dir / "indices" / "columns.npy"
    fillna = False
    output_adata = data_dir / f"CD7_adata_{int(fillna)}.pickle"
    # f-prefix removed: the string has no placeholders (ruff F541).
    output_intensity_distribution = data_dir / "figures" / "CD7_intensity_distribution.pickle"
# Load the SCIP feature table and restrict it to panel "D".
df = pq.read_table(features).to_pandas()
df = df.set_index(["meta_panel", "meta_replicate", "meta_P", "meta_id"])
df = df.loc["D"]
# Subset to the precomputed column and row selections loaded from disk
# (presumably a quality-controlled subset — confirm against the workflow).
df = df[[c for c in numpy.load(columns, allow_pickle=True) if c in df.columns]]
df = df.loc[numpy.load(index, allow_pickle=True)]
df = df.sort_index()
df.shape
(29959, 1188)
# NOTE(review): duplicate import — VarianceThreshold is already imported at the top.
from sklearn.feature_selection import VarianceThreshold
# Drop zero-variance features; keep the metadata columns alongside.
var = VarianceThreshold().fit(df.filter(regex="feat"))
df = pandas.concat(
    [df.filter(regex="feat").iloc[:, var.get_support()], df.filter(regex="meta")],
    axis=1,
)
# Sanity check: no column is entirely NaN.
df.isna().all(axis=0).any()
False
# Sanity check: no row is entirely NaN in the feature columns.
df.filter(regex="feat").isna().all(axis=1).sum()
0
# Distribution of per-column NaN counts.
seaborn.histplot(data=df.isna().sum())
<AxesSubplot:ylabel='Count'>
# Missing values: either impute zeros or drop every column containing a NaN,
# depending on the `fillna` configuration flag.
if not fillna:
    cols_with_na = df.columns[df.isna().sum() > 0]
    df = df.drop(columns=cols_with_na)
else:
    df = df.fillna(0)

# Assemble the AnnData object: feature matrix as X, metadata as obs.
obs = df.filter(regex="meta").reset_index()
obs.index = df.index
feature_matrix = df.filter(regex="feat").astype(numpy.float32)
adata = anndata.AnnData(feature_matrix, obs=obs)
adata.raw = adata.copy()
adata.obs["meta_replicate"] = adata.obs["meta_replicate"].astype("category")

markers = [col for col in adata.var.index if col.startswith("feat_sum")]
adata_pre = adata.copy()
%%time
# Quick pre-selection embedding to eyeball replicate batch effects:
# scale -> PCA -> kNN graph -> UMAP, colored by replicate.
scanpy.pp.scale(adata_pre)
scanpy.tl.pca(adata_pre, svd_solver='arpack')
scanpy.pp.neighbors(adata_pre, n_neighbors=30)
scanpy.tl.umap(adata_pre)
scanpy.pl.umap(adata_pre, color=["meta_replicate"])
computing PCA
with n_comps=50
finished (0:00:00)
computing neighbors
using 'X_pca' with n_pcs = 50
finished: added to `.uns['neighbors']`
`.obsp['distances']`, distances for each pair of neighbors
`.obsp['connectivities']`, weighted adjacency matrix (0:00:08)
computing UMAP
finished: added
'X_umap', UMAP coordinates (adata.obsm) (0:00:44)
CPU times: user 1min 56s, sys: 15.4 s, total: 2min 11s Wall time: 54.6 s
# Features whose name matches one of these substrings are treated as discrete
# downstream (passed as `discrete_features` to mutual_info_classif).
discrete = ["median", "area", "euler"]
# Collect matching names and their column indices in a single pass
# (previously two separate scans over adata.var_names).
discrete_cols = []
discrete_cols_i = []
for i, c in enumerate(adata.var_names):
    if any(d in c for d in discrete):
        discrete_cols.append(c)
        discrete_cols_i.append(i)
# Mark the summed-intensity features of the four stains as markers.
adata.var["is_marker"] = [
    any(n.endswith("feat_combined_sum_%s" % m) for m in ["DAPI", "EGFP", "RPe", "APC"])
    for n in adata.var_names
]
# Intensity-derived stain features get the asinh transform below.
adata.var["do_asinh"] = [
    (any(m in n for m in ["DAPI", "EGFP", "RPe", "APC"]) and any(o in n for o in props))
    for n in adata.var_names
]
%%time
# Per (replicate, well-position) normalization:
#  - do_asinh features: asinh transform with t = per-group max, then mean-centering
#  - all other features: standard scaling
# The groupby keys are levels of the obs MultiIndex set earlier.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    sc_df = scanpy.get.obs_df(adata, keys=adata.var_names.to_list())
    sc_df[adata.var_names[adata.var.do_asinh].to_list()] = sc_df[adata.var_names[adata.var.do_asinh]].groupby(["meta_replicate", "meta_P"]).transform(lambda x: asinh_scale(x, x.max()))
    sc_df[adata.var_names[~adata.var.do_asinh].to_list()] = sc_df[adata.var_names[~adata.var.do_asinh]].groupby(["meta_replicate", "meta_P"]).transform(lambda x: scale(x))
# Rebuild the AnnData with the normalized matrix; raw is carried over.
adata = anndata.AnnData(X=sc_df, obs=adata.obs, var=adata.var, raw=adata.raw)
CPU times: user 1min 16s, sys: 1.01 s, total: 1min 17s Wall time: 1min 17s
def map_names(a):
    """Translate a summed-intensity feature name to its marker label.

    Raises KeyError for names outside the four known stains.
    """
    marker_labels = {
        "feat_combined_sum_DAPI": "DAPI",
        "feat_combined_sum_EGFP": "CD45",
        "feat_combined_sum_RPe": "Siglec 8",
        "feat_combined_sum_APC": "CD15",
    }
    return marker_labels[a]
# Long-format table of the marker intensities for plotting.
aligned_df = scanpy.get.obs_df(
    adata, keys=adata.var_names[adata.var.is_marker].to_list()
).reset_index()
melted_df = pandas.melt(
    aligned_df,
    id_vars=["meta_P", "meta_replicate"],
    value_vars=adata.var_names[adata.var.is_marker].to_list(),
)
melted_df.variable = melted_df.variable.apply(map_names)
# One panel per (marker, replicate): fluorescence intensity vs. well position.
grid = seaborn.FacetGrid(
    data=melted_df,
    col="meta_replicate",
    row="variable",
    sharey="row",
    aspect=1.5,
    margin_titles=True,
)
grid.map_dataframe(seaborn.stripplot, x="meta_P", y="value", size=1, alpha=0.5)
grid.set_axis_labels("Well image position", "Fluorescence intensity")
grid.set_titles(col_template="Replicate {col_name}", row_template="{row_name}")
grid.add_legend()
plt.savefig(output_intensity_distribution, bbox_inches='tight', pad_inches=0)
<seaborn.axisgrid.FacetGrid at 0x7fab8c380fd0>
def rediscritize(v):
    """Replace each value in ``v`` by the mean of the histogram bin it falls in.

    Bins come from numpy's default ``histogram_bin_edges`` (10 equal-width bins
    over the data range); ``digitize`` maps the maximum value to an extra
    rightmost index.

    Means are computed only for occupied bin indices. The previous version
    averaged every index up to the maximum, which produced NaN means and
    "Mean of empty slice" RuntimeWarnings for empty bins; since no value can
    map to an empty bin, the returned array is unchanged by this fix.
    """
    edges = numpy.histogram_bin_edges(v)
    bin_idx = numpy.digitize(v, bins=edges)
    bin2mu = {i: v[bin_idx == i].mean() for i in numpy.unique(bin_idx)}
    return numpy.fromiter((bin2mu[i] for i in bin_idx), dtype=float)
# Drop cells that still have any NaN feature value.
adata = adata[adata.to_df().isna().sum(axis=1) == 0].copy()
X = adata.X.copy()
# Re-discretize the quasi-discrete columns so mutual_info_classif can treat
# them as categorical. The warnings below come from empty histogram bins
# inside rediscritize (mean of an empty slice).
X[:, discrete_cols_i] = numpy.apply_along_axis(rediscritize, 0, X[:, discrete_cols_i])
/srv/scratch/maximl/mambaforge/envs/scip/lib/python3.9/site-packages/numpy/core/fromnumeric.py:3474: RuntimeWarning: Mean of empty slice. return _methods._mean(a, axis=axis, dtype=dtype, /srv/scratch/maximl/mambaforge/envs/scip/lib/python3.9/site-packages/numpy/core/_methods.py:189: RuntimeWarning: invalid value encountered in true_divide ret = ret.dtype.type(ret / rcount)
%%time
# Mutual information between each feature and the replicate label; high MI
# marks replicate-associated (batch-affected) features. Sorted descending.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    mi_post = mutual_info_classif(X=X, y=adata.obs["meta_replicate"], discrete_features=discrete_cols_i, n_neighbors=30, random_state=0)
mi_post = pandas.Series(mi_post, index=adata.var_names).sort_values(ascending=False)
CPU times: user 3min 44s, sys: 1.28 s, total: 3min 45s Wall time: 3min 45s
# Locate the elbow in the descending MI curve.
kneedle = KneeLocator(
    numpy.arange(len(mi_post)),
    mi_post,
    S=40,
    curve="convex",
    direction="decreasing",
    online=False,
)
# NOTE(review): kneedle.knee can be None when no knee is detected, which would
# make int(...) raise here — confirm S=40 always yields a knee on this data.
elbow_value = mi_post.iloc[int(kneedle.knee)]
kneedle.plot_knee()
# Keep features whose MI with the replicate label falls BELOW the elbow,
# i.e. discard the most replicate-associated features.
selected_mi = mi_post[mi_post < elbow_value].index.values
len(selected_mi) / len(mi_post)
0.8460721868365181
len(mi_post[mi_post > elbow_value].index.values)
144
# Restrict to the MI-selected features and compute pairwise Spearman correlation.
adata2 = adata[:, selected_mi].copy()
corr = scipy.stats.spearmanr(adata2.X).correlation
seaborn.clustermap(corr, vmin=-1, vmax=1, center=0)
/srv/scratch/maximl/mambaforge/envs/scip/lib/python3.9/site-packages/seaborn/matrix.py:657: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance. warnings.warn(msg)
<seaborn.matrix.ClusterGrid at 0x7fab5c1273d0>
# Force exact symmetry and a unit diagonal so the matrix is a valid input
# for squareform-based hierarchical clustering below.
corr = (corr + corr.T) / 2
numpy.fill_diagonal(corr, 1)
def feat_from_corr(corr):
    """Cluster features by correlation distance and pick one representative each.

    Ward-links features on the distance 1 - |rho|, cuts the dendrogram at
    distance 0.1, and keeps, per cluster, the feature with the largest
    standard deviation in ``adata2.X``.

    NOTE(review): depends on the module-level ``adata2`` (``X`` and
    ``var_names``) instead of receiving it as a parameter.

    Returns:
        (features, indices, clusters): selected names, their column indices,
        and the cluster label for every feature.
    """
    # We convert the correlation matrix to a distance matrix before performing
    # hierarchical clustering using Ward's linkage.
    distance_matrix = 1 - numpy.abs(corr)
    dist_linkage = hierarchy.ward(squareform(distance_matrix))
    clusters = fcluster(dist_linkage, 0.1, criterion="distance").ravel()
    indices = []
    for c in numpy.unique(clusters):
        ci = numpy.flatnonzero(clusters == c)
        # Representative = highest-variance member of the cluster.
        indices.append(ci[adata2.X[:, ci].std(axis=0).argmax()])
    features = [adata2.var_names[i] for i in indices]
    return features, indices, clusters
features, indices, clusters = feat_from_corr(corr)
# Correlation structure among the selected representatives only.
seaborn.clustermap(corr[indices, :][:, indices], vmin=-1, vmax=1, center=0)
<seaborn.matrix.ClusterGrid at 0x7fab889c4550>
# Record the selection mask and cluster assignment in var.
adata2.var["selected_corr"] = False
adata2.var.loc[features, "selected_corr"] = True
adata2.var["feature_clusters"] = clusters
scanpy.pp.scale(adata2)
# Build the neighborhood graph on the correlation-pruned feature set.
adata3 = adata2[:, adata2.var.selected_corr].copy()
scanpy.tl.pca(adata3, svd_solver="arpack", random_state=0)
scanpy.pp.neighbors(adata3, n_neighbors=10, method="umap", random_state=0)
computing PCA
with n_comps=50
finished (0:00:00)
computing neighbors
using 'X_pca' with n_pcs = 50
finished: added to `.uns['neighbors']`
`.obsp['distances']`, distances for each pair of neighbors
`.obsp['connectivities']`, weighted adjacency matrix (0:00:03)
%%time
# Leiden clustering at two resolutions for comparison.
resolutions = [0.5, 0.75]
for res in resolutions:
    scanpy.tl.leiden(adata3, resolution=res, key_added=f"leiden_{res}", random_state=0)
running Leiden clustering
finished: found 9 clusters and added
'leiden_0.5', the cluster labels (adata.obs, categorical) (0:00:06)
running Leiden clustering
finished: found 10 clusters and added
'leiden_0.75', the cluster labels (adata.obs, categorical) (0:00:07)
CPU times: user 13.4 s, sys: 314 ms, total: 13.7 s
Wall time: 13.6 s
scanpy.tl.umap(adata3, random_state=0)
computing UMAP
finished: added
'X_umap', UMAP coordinates (adata.obsm) (0:00:29)
# Compare both clusterings on the embedding, alongside the replicate label.
scanpy.pl.umap(
    adata3,
    color=["leiden_0.5", "leiden_0.75", "meta_replicate"],
    legend_loc="on data",
)
# Use the finer-resolution clustering as the final labels.
adata3.obs["leiden"] = adata3.obs["leiden_0.75"]
# Cluster sizes per replicate.
seaborn.countplot(data=adata3.obs, x="leiden", hue="meta_replicate")
<AxesSubplot:xlabel='leiden', ylabel='count'>
# Export: full (pre-pruning) feature matrix from adata2 combined with the
# clustering labels and embeddings computed on adata3.
adata_out = anndata.AnnData(
    X=adata2.X, var=adata2.var, obs=adata3.obs, obsm=adata3.obsm, raw=adata2.raw
)
import pickle

# BUG FIX: previously wrote to the undefined name `output` (NameError at
# runtime); the configured destination is `output_adata`.
with open(output_adata, "wb") as fh:
    pickle.dump(adata_out, fh)